Import libraries and dataset
import pandas as pd
import plotly.express as px
import numpy as np
import plotly.graph_objects as go

## Load in the data, this is the same as in the book:
## load in the brain-region hierarchy lookup table.
url = "https://raw.githubusercontent.com/bcaffo/MRIcloudT1volumetrics/master/inst/extdata/multilevel_lookup_table.txt"
multilevel_lookup = pd.read_csv(url, sep = "\t").drop(['Level5'], axis = 1)
## The raw file has unhelpful column names ("modify", "modify.1", ...);
## rename them to the hierarchy levels they actually represent.
multilevel_lookup = multilevel_lookup.rename(columns = {
    "modify" : "roi",
    "modify.1" : "level4",
    "modify.2" : "level3",
    "modify.3" : "level2",
    "modify.4" : "level1"})
multilevel_lookup = multilevel_lookup[['roi', 'level4', 'level3', 'level2', 'level1']]
## There's some whitespace mucking things up
multilevel_lookup['level2'] = multilevel_lookup['level2'].str.strip()

## Load in the subject data and merge.
## NOTE: renamed from `id` so we don't shadow the builtin id().
subject_id = 127
subjectData = pd.read_csv("https://raw.githubusercontent.com/smart-stats/ds4bio_book/main/book/assetts/kirby21AllLevels.csv").drop(['Unnamed: 0'], axis = 1)
## A list of this subject's ROIs with 'ICV' (the hierarchy root) prepended;
## used later to map region names to integer sankey node indices.
rois = subjectData[ ['roi', 'level'] ][(subjectData.type == 1) & (subjectData.id == subject_id)]
rois = ['ICV'] + rois['roi'].tolist()
## Keep only the finest-grained (level 5) type-1 volumes for this subject.
subjectData = subjectData.loc[(subjectData.type == 1) & (subjectData.level == 5) & (subjectData.id == subject_id)]
subjectData = subjectData[['roi', 'volume']]
## Merge the subject data with the multilevel lookup, add the root level,
## and compute each region's volume as a fraction of the total (comp).
subjectData = pd.merge(subjectData, multilevel_lookup, on = "roi")
subjectData = subjectData.assign(level0 = "ICV")
subjectData = subjectData.assign(comp = subjectData.volume / np.sum(subjectData.volume))
subjectData.head(5)
| roi | volume | level4 | level3 | level2 | level1 | level0 | comp | |
|---|---|---|---|---|---|---|---|---|
| 0 | SFG_L | 12926 | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L | ICV | 0.009350 |
| 1 | SFG_R | 10050 | SFG_R | Frontal_R | CerebralCortex_R | Telencephalon_R | ICV | 0.007270 |
| 2 | SFG_PFC_L | 12783 | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L | ICV | 0.009247 |
| 3 | SFG_PFC_R | 11507 | SFG_R | Frontal_R | CerebralCortex_R | Telencephalon_R | ICV | 0.008324 |
| 4 | SFG_pole_L | 3078 | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L | ICV | 0.002227 |
## Sunburst of the brain hierarchy: ICV at the center, leaf ROIs on the
## outside, wedge size proportional to compositional volume.
hierarchy_path = ['level0', 'level1', 'level2', 'level3', 'level4', 'roi']
fig = px.sunburst(
    subjectData,
    path = hierarchy_path,
    values = 'comp',
    width = 800,
    height = 800,
)
fig.show()
/opt/tljh/user/lib/python3.9/site-packages/plotly/express/_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_all_trees = df_all_trees.append(df_tree, ignore_index=True) /opt/tljh/user/lib/python3.9/site-packages/plotly/express/_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_all_trees = df_all_trees.append(df_tree, ignore_index=True) /opt/tljh/user/lib/python3.9/site-packages/plotly/express/_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_all_trees = df_all_trees.append(df_tree, ignore_index=True) /opt/tljh/user/lib/python3.9/site-packages/plotly/express/_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_all_trees = df_all_trees.append(df_tree, ignore_index=True) /opt/tljh/user/lib/python3.9/site-packages/plotly/express/_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_all_trees = df_all_trees.append(df_tree, ignore_index=True) /opt/tljh/user/lib/python3.9/site-packages/plotly/express/_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df_all_trees = df_all_trees.append(df_tree, ignore_index=True)
## Get the data ready for the sankey diagram.
## Each link is (source region, target region, compositional volume).
## Select only the numeric 'comp' column before summing: calling .sum()
## on the whole frame would try to aggregate the string level columns
## too, which raises a TypeError in pandas >= 2.0.
## Level 0 to Level 1
l1 = subjectData.groupby(['level1', 'level0'], as_index = False)[['comp']].sum()
l1 = l1.rename(columns = {'level1' : 'target', 'level0' : 'source'})
## Level 1 to Level 2
l2 = subjectData.groupby(['level2', 'level1'], as_index = False)[['comp']].sum()
l2 = l2.rename(columns = {'level2' : 'target', 'level1' : 'source'})
## Concatenate the two link sets into one table
sankey_dat = pd.concat([l1, l2])
## Plotly's sankey wants integer node indices rather than names; map each
## region name to its position in the `rois` list.
sankey_dat['target_idx'] = [rois.index(x) for x in sankey_dat['target']]
sankey_dat['source_idx'] = [rois.index(x) for x in sankey_dat['source']]
## Now look at this data, should be relatively easy to convert to the required format
sankey_dat
| target | source | comp | target_idx | source_idx | |
|---|---|---|---|---|---|
| 0 | CSF | ICV | 0.079417 | 8 | 0 |
| 1 | Diencephalon_L | ICV | 0.008548 | 3 | 0 |
| 2 | Diencephalon_R | ICV | 0.008362 | 4 | 0 |
| 3 | Mesencephalon | ICV | 0.007430 | 5 | 0 |
| 4 | Metencephalon | ICV | 0.115313 | 6 | 0 |
| 5 | Myelencephalon | ICV | 0.003599 | 7 | 0 |
| 6 | Telencephalon_L | ICV | 0.384220 | 1 | 0 |
| 7 | Telencephalon_R | ICV | 0.393111 | 2 | 0 |
| 0 | BasalForebrain_L | Diencephalon_L | 0.003960 | 15 | 3 |
| 1 | BasalForebrain_R | Diencephalon_R | 0.003753 | 16 | 4 |
| 2 | CerebralCortex_L | Telencephalon_L | 0.200361 | 9 | 1 |
| 3 | CerebralCortex_R | Telencephalon_R | 0.204623 | 10 | 2 |
| 4 | CerebralNucli_L | Telencephalon_L | 0.008956 | 11 | 1 |
| 5 | CerebralNucli_R | Telencephalon_R | 0.009460 | 12 | 2 |
| 6 | Mesencephalon_L | Mesencephalon | 0.003577 | 17 | 5 |
| 7 | Mesencephalon_R | Mesencephalon | 0.003853 | 18 | 5 |
| 8 | Metencephalon_L | Metencephalon | 0.057506 | 20 | 6 |
| 9 | Metencephalon_R | Metencephalon | 0.057807 | 19 | 6 |
| 10 | Myelencephalon_L | Myelencephalon | 0.001738 | 21 | 7 |
| 11 | Myelencephalon_R | Myelencephalon | 0.001861 | 22 | 7 |
| 12 | Sulcus_L | CSF | 0.024577 | 26 | 8 |
| 13 | Sulcus_R | CSF | 0.021714 | 27 | 8 |
| 14 | Thalamus_L | Diencephalon_L | 0.004588 | 13 | 3 |
| 15 | Thalamus_R | Diencephalon_R | 0.004609 | 14 | 4 |
| 16 | Ventricle | CSF | 0.033126 | 25 | 8 |
| 17 | WhiteMatter_L | Telencephalon_L | 0.174904 | 23 | 1 |
| 18 | WhiteMatter_R | Telencephalon_R | 0.179029 | 24 | 2 |
## Node labels: ICV (the root) followed by every link target, in the same
## order as the integer indices assigned above.
san_labels = ["ICV"] + sankey_dat['target'].tolist()
fig = go.Figure(data=[go.Sankey(
    ## Compositions are fractions of ICV (all < 1), so show three
    ## decimals; the previous ".0f" format rounded every value to 0, and
    ## the "TWh" value suffix was a leftover from the plotly energy
    ## example and has no meaning here.
    valueformat = ".3f",
    # Define nodes
    node = dict(
        pad = 15,
        thickness = 15,
        line = dict(color = "black", width = 0.5),
        label = san_labels
    ),
    # Add links: source/target are integer indices into the node labels.
    link = dict(
        source = sankey_dat['source_idx'],
        target = sankey_dat['target_idx'],
        value = sankey_dat['comp'],
        ## One label per link (the previous san_labels list had an extra
        ## leading "ICV" entry, misaligning labels with links).
        label = sankey_dat['target'],
    ))])
fig.update_layout(title_text="Brain composition",
                  font_size=10)
## `engine` is not a fig.show() argument and "collab" is not a renderer
## name; plotly auto-detects the environment (use renderer="colab" to
## force the Google Colab renderer explicitly).
fig.show()
My public webpage
import sqlite3 as sq3

con = sq3.connect("opioid.db")
# cursor() creates an object that can execute functions in the sqlite cursor
sql = con.cursor()
annual = pd.read_sql_query("select * from annual", con)
## Drop the stored row-index column.
annual = annual.iloc[: , 1:]
## Inspect rows missing a county FIPS code, excluding Puerto Rico.
## The original expression called .loc(...) on a bare list comparison
## (['BUYER_STATE'] != 'PR'), which filtered nothing; combine the two
## boolean masks instead.
annual[annual['countyfips'].isnull() & (annual['BUYER_STATE'] != 'PR')]
# for row in sql.execute("select * from annual where countyfips = 'NA' and BUYER_STATE != 'PR' limit 10;"):
#     print(row)
annual.head()
| BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | |
|---|---|---|---|---|---|---|
| 0 | ABBEVILLE | SC | 2006 | 877 | 363620 | 45001 |
| 1 | ABBEVILLE | SC | 2007 | 908 | 402940 | 45001 |
| 2 | ABBEVILLE | SC | 2008 | 871 | 424590 | 45001 |
| 3 | ABBEVILLE | SC | 2009 | 930 | 467230 | 45001 |
| 4 | ABBEVILLE | SC | 2010 | 1197 | 539280 | 45001 |
Inspect the missing data further on your own. It looks like it's the unincorporated territories and a handful of Arkansas rows that are missing countyfips (Federal Information Processing Standards) codes. Specifically, Montgomery County, AR is missing FIPS codes. Since we want to look at US states specifically, excluding territories, we will just set the Montgomery County values to the correct code, 05097, and ignore the other missing values.
## Patch the known-missing FIPS code: Montgomery County, AR is 05097.
montgomery_ar = (annual['BUYER_STATE'] == 'AR') & (annual['BUYER_COUNTY'] == 'MONTGOMERY')
annual.loc[montgomery_ar, 'countyfips'] = '05097'
annual.head()
| BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | |
|---|---|---|---|---|---|---|
| 0 | ABBEVILLE | SC | 2006 | 877 | 363620 | 45001 |
| 1 | ABBEVILLE | SC | 2007 | 908 | 402940 | 45001 |
| 2 | ABBEVILLE | SC | 2008 | 871 | 424590 | 45001 |
| 3 | ABBEVILLE | SC | 2009 | 930 | 467230 | 45001 |
| 4 | ABBEVILLE | SC | 2010 | 1197 | 539280 | 45001 |
Now let's delete rows from the annual table that have missing county data. Check on these counties before and verify that they've been deleted afterwards. Also, we want to grab just three columns from the land table, so let's create a new one called land_area. The column there is called STCOU, which we want to rename to countyfips. (I'm going to stop printing out the results of every step, so make sure you're checking your work as you go.)
## Normalize the literal string 'NA' to a real missing value, then drop
## rows with no county name. np.nan replaces the np.NaN alias, which was
## removed in numpy 2.0.
annual = annual.replace('NA', np.nan)
annual = annual.dropna(axis = 0, subset = ['BUYER_COUNTY'])
# annual.BUYER_COUNTY.isnull().values.any()
annual.tail()
| BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | |
|---|---|---|---|---|---|---|
| 55495 | ZAVALA | TX | 2010 | 248 | 200100 | 48507 |
| 55496 | ZAVALA | TX | 2011 | 406 | 244800 | 48507 |
| 55497 | ZAVALA | TX | 2012 | 473 | 263700 | 48507 |
| 55498 | ZAVALA | TX | 2013 | 399 | 186700 | 48507 |
| 55499 | ZAVALA | TX | 2014 | 162 | 148930 | 48507 |
# sql.execute("create table land_area as select Areaname, STCOU, LND110210D from land;")
# sql.execute("alter table land_area rename column STCOU to countyfips;")
## Pull the (already-created) land_area table: county name, FIPS code,
## and 2010 land area in square miles.
land_area = pd.read_sql_query("select * from land_area", con)
## `con.close` without parentheses only referenced the bound method and
## never closed the connection; actually call it now that we're done.
con.close()
land_area.head()
land_area.tail()
| Areaname | countyfips | LND110210D | |
|---|---|---|---|
| 3193 | Sweetwater, WY | 56037 | 10426.65 |
| 3194 | Teton, WY | 56039 | 3995.38 |
| 3195 | Uinta, WY | 56041 | 2081.26 |
| 3196 | Washakie, WY | 56043 | 2238.55 |
| 3197 | Weston, WY | 56045 | 2398.09 |
## Attach land-area information to each annual county record; a left
## merge keeps every annual row even when land_area has no match.
county_info = annual.merge(land_area, how = 'left', on = 'countyfips')
county_info.head()
| BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | Areaname | LND110210D | |
|---|---|---|---|---|---|---|---|---|
| 0 | ABBEVILLE | SC | 2006 | 877 | 363620 | 45001 | Abbeville, SC | 490.48 |
| 1 | ABBEVILLE | SC | 2007 | 908 | 402940 | 45001 | Abbeville, SC | 490.48 |
| 2 | ABBEVILLE | SC | 2008 | 871 | 424590 | 45001 | Abbeville, SC | 490.48 |
| 3 | ABBEVILLE | SC | 2009 | 930 | 467230 | 45001 | Abbeville, SC | 490.48 |
| 4 | ABBEVILLE | SC | 2010 | 1197 | 539280 | 45001 | Abbeville, SC | 490.48 |
## Sanity-check the other end of the merged frame as well.
county_info.tail()
| BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | Areaname | LND110210D | |
|---|---|---|---|---|---|---|---|---|
| 55478 | ZAVALA | TX | 2010 | 248 | 200100 | 48507 | Zavala, TX | 1297.41 |
| 55479 | ZAVALA | TX | 2011 | 406 | 244800 | 48507 | Zavala, TX | 1297.41 |
| 55480 | ZAVALA | TX | 2012 | 473 | 263700 | 48507 | Zavala, TX | 1297.41 |
| 55481 | ZAVALA | TX | 2013 | 399 | 186700 | 48507 | Zavala, TX | 1297.41 |
| 55482 | ZAVALA | TX | 2014 | 162 | 148930 | 48507 | Zavala, TX | 1297.41 |
Create an interactive scatter plot of the average number of opioid pills by year using plotly. See the example here. Don't do the intervals (little vertical lines), only the points. Add your plot to an html file in the repo with your Sankey diagram and host it publicly. Put a link to your hosted file in a markdown cell of your hw4.ipynb file. Note, an easy way to create a webpage with this graphic is to export an ipynb as an html file.
## Average pills per county (in millions), by state and year.
county_info = county_info.dropna(axis = 0)
county_info["DOSAGE_UNIT"] = pd.to_numeric(county_info["DOSAGE_UNIT"])
county_info = county_info.assign(Pills_in_millions = county_info.DOSAGE_UNIT / 1000000)
## Keep only the grouping keys and the numeric column being averaged:
## carrying BUYER_COUNTY into the frame made .mean() try to average a
## string column, which raises a TypeError in pandas >= 2.0.
state_info = county_info[["BUYER_STATE", "year", "Pills_in_millions"]] \
    .groupby(by = ["BUYER_STATE", "year"], as_index = False).mean()
print(state_info)
raw_state_avg = px.scatter(data_frame = state_info, x = "year", y = "Pills_in_millions")
## `engine="collab"` is not a valid fig.show() argument/renderer; let
## plotly auto-detect (or pass renderer="colab" in Google Colab).
raw_state_avg.show()
## Export this notebook to a standalone HTML page for public hosting.
! jupyter nbconvert --to html hw4.ipynb
[NbConvertApp] Converting notebook hw4.ipynb to html [NbConvertApp] Writing 4416095 bytes to hw4.html
My public webpage